# encoding:utf-8
# user: ares at 18-9-7
# 爬取链家网的租房信息    https://sz.lianjia.com/zufang/

import requests
from bs4 import BeautifulSoup
import time
from pymongo import MongoClient
import spderPublic

def getOnePage(url, header, zfList):
    """Scrape one Lianjia listings page and append parsed rental records.

    Parameters:
        url: listings page URL (e.g. https://sz.lianjia.com/zufang/pgN).
        header: dict of HTTP request headers.
        zfList: accumulator list of record dicts; mutated in place.

    Returns:
        The same zfList with any newly parsed, non-duplicate records appended.
    """
    wbdata = requests.get(url, headers=header).text
    soup = BeautifulSoup(wbdata, 'html.parser')

    divs = soup.find_all('div', attrs={'class': 'info-panel'})

    for div in divs:
        try:
            title = div.find('h2').text.strip()
            where = div.find('div', attrs={'class': 'where'}).find_all('span')
            xiaoqu = where[0].text.strip()      # residential compound name
            huxing = where[2].text.strip()      # room layout
            mianji = where[3].text.strip()      # area text, e.g. '89平米'
            area = float(mianji[:-2])           # drop the trailing 2-char unit ('平米')
            chaoxiang = where[4].text.strip()   # orientation

            price = int(div.find('div', attrs={'class': 'price'}).find('span').text.strip())

            durl = div.h2.a['href']
            diqu, jiedao, jtms = getDetailUrl(durl, header)

            zfDict = {
                'title': title,
                'area': area,
                'diqu': diqu,
                'jiedao': jiedao,
                'xiaoqu': xiaoqu,
                'price': price,
                'jtms': jtms,
                'huxing': huxing,
                'chaoxiang': chaoxiang,
                'zutype': '',
                'laiyuan': '链家'
            }

            # de-duplicate against records already collected
            if zfDict not in zfList:
                zfList.append(zfDict)
        # was a bare `except:`; narrowed to Exception so KeyboardInterrupt /
        # SystemExit still propagate — malformed listings are skipped as before
        except Exception:
            continue
    return zfList

def getDetailUrl(url, header):
    """Fetch one listing's detail page and extract location/transit fields.

    Returns a (diqu, jiedao, jtms) tuple: district (normalized via
    spderPublic.updateDist), street, and transit description.
    """
    page = requests.get(url, headers=header).text
    dom = BeautifulSoup(page, 'html.parser')
    room = dom.find_all('div', attrs={'class': 'zf-room'})[0]

    paragraphs = room.find_all('p')
    links = paragraphs[6].find_all('a')         # district / street anchors
    diqu = spderPublic.updateDist(links[0].text.strip())
    jiedao = links[1].text.strip()
    jtms = paragraphs[4].text.strip()           # transit description
    return diqu, jiedao, jtms

def insertData(datalist):
    """Persist scraped rental records into the local MongoDB test.zfdb collection.

    Parameters:
        datalist: list of record dicts; an empty list is a no-op
                  (insert_many raises InvalidOperation on []).
    """
    if not datalist:
        return
    client = MongoClient('localhost:27017', connect=True)
    try:
        collection = client['test']['zfdb']
        # Collection.insert() was deprecated in pymongo 3.x and removed in
        # 4.x; insert_many is the supported bulk-insert API
        collection.insert_many(datalist)
    finally:
        client.close()      # release the connection pool

if __name__ == '__main__':

    # Browser-like headers so Lianjia serves the normal HTML pages.
    header = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
        'Referer': 'https://sz.lianjia.com/',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'
    }
    datalist = []

    start = time.perf_counter()
    print('开始爬取链家租房数据'.center(100 // 2, '-'))
    # Crawl listing pages 1..100 with a text progress bar (one '#' per page).
    for page in range(1, 101):
        page_url = 'https://sz.lianjia.com/zufang/pg' + str(page)
        datalist = getOnePage(page_url, header, datalist)

        done = '#' * page
        todo = '-' * (100 - page)
        elapsed = time.perf_counter() - start
        print('\r{:^3.0f}%[{}->{}]{:.2f}s'.format(page, done, todo, elapsed), end='')

    insertData(datalist)

    print('\n' + '爬取链家租房数据结束'.center(100 // 2, '-'))